{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Kernel:** run this notebook in the `hest` environment from the HEST repository: https://github.com/mahmoodlab/HEST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from huggingface_hub import login\n",
    "\n",
    "# Authenticate with the Hugging Face Hub before accessing MahmoodLab/hest.\n",
    "# The token is read from the HF_TOKEN environment variable so that it is never\n",
    "# stored in the notebook; if the variable is unset or empty, token=None makes\n",
    "# login() ask for the token interactively.\n",
    "login(token=os.environ.get(\"HF_TOKEN\") or None)\n",
    "\n",
    "# Load the HEST metadata table that the dataset cells below filter on.\n",
    "meta_df = pd.read_csv(\"hf://datasets/MahmoodLab/hest/HEST_v1_1_0.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset - Human Kidney"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import os\n",
    "\n",
    "data_path = \"./hest1k_datasets/kidney\"\n",
    "# exist_ok=True keeps this cell re-runnable once the folder already exists.\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "# Select the kidney samples by the title of their source dataset.\n",
    "# na=False makes rows with a missing title count as non-matching.\n",
    "kidney_title = \"Spatial localization with Spatial Transcriptomics for an atlas of healthy and injured cell states\"\n",
    "is_kidney = meta_df[\"dataset_title\"].str.startswith(kidney_title, na=False)\n",
    "ids_to_query = meta_df.loc[is_kidney, \"id\"].tolist()\n",
    "# Fail early with a clear message instead of starting a download with no patterns.\n",
    "assert ids_to_query, \"No kidney sample IDs found in meta_df\"\n",
    "\n",
    "# One glob pattern per sample ID; `sample_id` avoids shadowing the builtin `id`.\n",
    "list_patterns = [f\"*{sample_id}[_.]**\" for sample_id in ids_to_query]\n",
    "dataset = datasets.load_dataset(\n",
    "    'MahmoodLab/hest',\n",
    "    cache_dir=data_path,\n",
    "    patterns=list_patterns\n",
    ")\n",
    "\n",
    "# HEST-1k records the patient ID in meta_df[\"subseries\"].\n",
    "# In our paper, slides are referred to by specimen ID rather than patient ID.\n",
    "# For the ID conversion, see Supplementary Table 2 (columns C and G) of the original paper: https://www.nature.com/articles/s41586-023-05769-3#Sec66\n",
    "\n",
    "# Test slide in the paper: NCBI697 (20-0038 AKI)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset - Human HER2ST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import os\n",
    "\n",
    "data_path = \"./hest1k_datasets/her2st\"\n",
    "# exist_ok=True keeps this cell re-runnable once the folder already exists.\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "# Select the HER2ST samples by the title of their source dataset.\n",
    "# na=False makes rows with a missing title count as non-matching.\n",
    "is_her2st = meta_df[\"dataset_title\"].str.startswith(\"Spatial deconvolution of\", na=False)\n",
    "ids_to_query = meta_df.loc[is_her2st, \"id\"].tolist()\n",
    "# Fail early with a clear message instead of starting a download with no patterns.\n",
    "assert ids_to_query, \"No HER2ST sample IDs found in meta_df\"\n",
    "\n",
    "# One glob pattern per sample ID; `sample_id` avoids shadowing the builtin `id`.\n",
    "list_patterns = [f\"*{sample_id}[_.]**\" for sample_id in ids_to_query]\n",
    "dataset = datasets.load_dataset(\n",
    "    'MahmoodLab/hest',\n",
    "    cache_dir=data_path,\n",
    "    patterns=list_patterns\n",
    ")\n",
    "\n",
    "# The slide IDs used in our paper are stored in meta_df[\"subseries\"].\n",
    "\n",
    "# Test slide in the paper: SPA148 (B1) or SPA123 (G2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset - Human Prostate Cancer (PRAD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import os\n",
    "\n",
    "data_path = \"./hest1k_datasets/PRAD\"\n",
    "# exist_ok=True keeps this cell re-runnable once the folder already exists.\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "# Sample IDs MEND139 through MEND162 (range() excludes the upper bound 163).\n",
    "ids_to_query = [f\"MEND{i}\" for i in range(139, 163)]\n",
    "\n",
    "# One glob pattern per sample ID; `sample_id` avoids shadowing the builtin `id`.\n",
    "list_patterns = [f\"*{sample_id}[_.]**\" for sample_id in ids_to_query]\n",
    "dataset = datasets.load_dataset(\n",
    "    'MahmoodLab/hest',\n",
    "    cache_dir=data_path,\n",
    "    patterns=list_patterns\n",
    ")\n",
    "\n",
    "# Test slide in the paper: MEND145"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset - Mouse Brain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import os\n",
    "\n",
    "data_path = \"./hest1k_datasets/mouse_brain\"\n",
    "# exist_ok=True keeps this cell re-runnable once the folder already exists.\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "# Keep only samples whose dataset title matches and whose disease state is \"Healthy\".\n",
    "# na=False makes rows with a missing title count as non-matching, so the\n",
    "# combined mask stays strictly boolean.\n",
    "title_matches = meta_df[\"dataset_title\"].str.startswith(\"Spatial Multimodal Analysis\", na=False)\n",
    "is_healthy = meta_df[\"disease_state\"] == \"Healthy\"\n",
    "ids_to_query = meta_df.loc[title_matches & is_healthy, \"id\"].tolist()\n",
    "# Fail early with a clear message instead of starting a download with no patterns.\n",
    "assert ids_to_query, \"No healthy mouse brain sample IDs found in meta_df\"\n",
    "\n",
    "# One glob pattern per sample ID; `sample_id` avoids shadowing the builtin `id`.\n",
    "list_patterns = [f\"*{sample_id}[_.]**\" for sample_id in ids_to_query]\n",
    "dataset = datasets.load_dataset(\n",
    "    'MahmoodLab/hest',\n",
    "    cache_dir=data_path,\n",
    "    patterns=list_patterns\n",
    ")\n",
    "\n",
    "# Test slide in the paper: NCBI667"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
